Introduction to the datasets

head(diamonds)
## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.230 Ideal     E     SI2      61.5  55.0   326  3.95  3.98  2.43
## 2 0.210 Premium   E     SI1      59.8  61.0   326  3.89  3.84  2.31
## 3 0.230 Good      E     VS1      56.9  65.0   327  4.05  4.07  2.31
## 4 0.290 Premium   I     VS2      62.4  58.0   334  4.20  4.23  2.63
## 5 0.310 Good      J     SI2      63.3  58.0   335  4.34  4.35  2.75
## 6 0.240 Very Good J     VVS2     62.8  57.0   336  3.94  3.96  2.48
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
ggplot(
  data=diamonds,
  aes(
    x=clarity
    )
  )+
  geom_bar()

ggplot(data=diamonds,aes(x=clarity))+
  geom_bar()

Now let’s make it look nicer, Note the fill aesthetic was added.

#define aesthetics
ggplot(data=diamonds,aes(x=clarity,fill=cut))+
  geom_bar()

See what happens when you assign fill within the geom, below, vs withing the top-level, default aesthetic mapping defined in ggplot(), as above.

The assigned to all bars the color gold.

#aesthetics outside aes
ggplot(data=diamonds,aes(x=clarity))+
  geom_bar(fill='gold')

Note what happens now when you assign the asthetic mapping to a color inside the geom.

ggplot(data=diamonds,aes(x=clarity))+
  geom_bar(aes(fill='gold')) #should be outside aes!

Another area for confusion, the difference between fill and color.

In geom_bar, there’s a fill color (fill), and outter color (color)

Note in the first plot, color is outlining the bar.

#fill/color
ggplot(data=diamonds,aes(x=clarity,color=cut))+
  geom_bar()

ggplot(data=mtcars,aes(factor(cyl),fill=factor(cyl)))+
  geom_bar(color='black')

ggplot(data=mtcars,aes(factor(cyl),color=factor(cyl)))+
  geom_bar()

Now let’s talk about points.

Run help(points) in your console to see your points optoin.

Now let’s look at different, non-default color schemes.

Note we’re adding scale_fill_... options.

#other color systems
(plottemp <- ggplot(data=diamonds,aes(x=clarity,fill=cut))+
    geom_bar())

plottemp + scale_fill_brewer(palette='Set2') #in-built

plottemp + scale_fill_grey(name = "cut of\ndiamonds")

plottemp + scale_fill_grey(name = "cut of\ndiamonds") + theme_bw()

Above, also note we assigned the base plot to the name plottemp. This object is all the data required to create this plot. Be aware this can be quite big if the data you’re dealing with is quite big.

Some more scales

# ggsci ####
plottemp+scale_fill_lancet()

plottemp+scale_fill_jama()

plottemp+scale_fill_startrek()

plottemp+scale_fill_economist()

How do we save ggplots?

The following saves the plot in your last created plot to your current working director, as a file named ggtestplot.png.

#save ggplots
ggsave('ggtestplot.png',width=20,height=20,
       units='cm',dpi=150)
# alternative:
# png('ggtestplot.png',width = 20,height = 20,units = 'cm',res = 150)
# plottemp
# dev.off()
# scatterplot
#other geoms
ggplot(data=mtcars,aes(x = wt,y = mpg))+
  geom_point() #default 1.5 IQR

ggplot(mtcars,aes(x = factor(gear),y = wt))+
  geom_boxplot()

ggplot(mtcars,aes(x = factor(gear),y = wt))+
  geom_boxplot(coef=3)

ggplot(mtcars,aes(x = factor(gear),y = wt,fill=factor(cyl)))+
  geom_boxplot(coef=3)

ggplot(mtcars,aes(x = factor(gear),y = wt))+
  geom_boxplot(coef=3)+
  geom_point()

#jitter just randomly moves the points around. this is useful if you want to visually get a sense of the distribution of these points. This is particularly useful with a continuous variable that can be grouped into a factor class. 
ggplot(mtcars,aes(x = factor(gear),y = wt))+
  geom_boxplot(coef=3)+
  geom_point(position = position_jitter(width = .1))

ggplot(mtcars,aes(x = factor(gear),y = wt))+
  geom_boxplot(coef=3)+
  geom_dotplot(alpha=.7,
               binaxis = 'y',stackdir = 'center',
               stackratio = .7,dotsize = .6)

Now let’s work with a scatter plot.

#aesthetics again (finetuning)
ggplot(data=mtcars,aes(wt, mpg,color=qsec))+
  geom_point(size=4) #outside aes!

ggplot(data=mtcars,aes(wt, mpg,color=qsec, size=carb))+
  geom_point()

ggplot(data=mtcars,aes(wt, mpg,color=qsec, size=carb))+
  scale_color_gradient(low='darkred',high='blue')+
  geom_point()

ggplot(data=mtcars,aes(wt, mpg,color=qsec, size=carb))+
  scale_color_gradient2(low='red',high='yellow',
                        mid='blue',
                        limits=c(13,23),midpoint=18)+
  geom_point()

# use different aesthetic mappings
ggplot(data=mtcars,
       aes(wt, mpg,size=qsec, shape=factor(cyl)))+
  geom_point()

ggplot(data=mtcars,aes(carb, mpg,color=qsec, size=wt))+
  geom_point(shape=2)

#location, location,position
p<-ggplot(data=diamonds,aes(clarity,fill=cut))
p+geom_bar(position="stack")

p+geom_bar(position="dodge")

p+geom_bar(position="fill")

p+geom_bar(position="fill")+
  scale_y_continuous('Frequency (%)',
                     breaks=seq(0,1,.2),
                     labels=seq(0,100,20))

p+geom_bar(position="identity")

p+geom_bar(position="identity",alpha=.5)

ggplot(data=diamonds,aes(clarity,color=cut, group=cut))+
  geom_freqpoly(stat='count',position="identity",lwd=1.5)+
  geom_point(stat='count',size=5)

#layer/order / computed geoms
ggplot(data=mtcars,aes(wt, mpg))+
  geom_point(size=4)+
  geom_smooth(size=3)

ggplot(data=mtcars,aes(wt, mpg))+
  geom_smooth(size=3)+
  geom_point(size=4)

ggplot(data=mtcars,aes(wt, mpg))+
  geom_smooth(size=3,color='red')+
  geom_smooth(method='lm',size=3)+
  geom_point(size=4)

ggplot(data=mtcars,aes(wt, mpg,color=factor(cyl)))+
  geom_point(size=4)+
  geom_smooth(method='lm',size=1)

ggplot(data=mtcars,aes(wt, mpg,color=factor(cyl),
                       shape=factor(am)))+
  geom_point(size=2)+
  geom_smooth(method='lm',size=1,se=F)

#? lm for all?
ggplot(data=mtcars,aes(wt, mpg))+
  geom_smooth(size=1,color='black',fill='yellow')+
  geom_point(size=3,aes(color=factor(cyl),shape=factor(am))) #aes for geom only

# facet_wrap / facet_grid
(p.tmp <- ggplot(mtcars, aes(mpg, wt)) + geom_point())

p.tmp + facet_wrap(~cyl)

p.tmp + facet_wrap(~cyl, ncol=2)

p.tmp + facet_grid(gear~cyl,labeller=label_both,margins='gear')

p.tmp + geom_smooth()+
  facet_grid(gear~cyl, labeller=label_both,margins=T)

p.tmp + facet_wrap(~cyl+gear,labeller=label_both) # empty combination is dropped

# controlling scales in facets (default: scales="fixed")
p.tmp + facet_grid(gear~cyl, scales="fixed")

p.tmp + facet_grid(gear~cyl, scales="free")

p.tmp + facet_grid(gear~cyl, scales="free_x")

p.tmp + facet_grid(gear~cyl, scales="free_y")

# summaries ####
(plottemp <- ggplot(mtcars,aes(factor(gear),mpg))+
   geom_point())

plottemp+stat_summary(color='red')

plottemp+stat_summary(fun.data='mean_cl_boot',color='red') #create your own summary function

# ggsign ####
p <- round(wilcox.test(mtcars$mpg~mtcars$am)$p.value,5)
(plottemp <- ggplot(mtcars,aes(as.factor(am),mpg))+
    geom_boxplot())

plottemp+geom_signif(comparisons=list(c(1,2)),
                     # aes(y=0), 
                     textsize = 3, vjust = 0.0, 
                     y_position=max(mtcars$mpg+3),
                     annotations=paste0('p =', p),
                     tip_length=.01)

plottemp + geom_signif(comparisons=list(1:2),vjust=0)

old <- theme_set(theme_wsj())
ggplot(data=diamonds,aes(x=clarity,fill=cut))+
  geom_bar()

theme_update(legend.position="bottom",
             axis.text=element_text(colour = "darkblue",
                                    size=12),
             axis.title=element_text(size=15),
             plot.margin=unit(c(3,4,.5,3),'lines'),    #o,r,u,l
             axis.title.y=element_text(vjust=0.4,angle=90),
             legend.key.size=unit(2.5,'lines'),
             panel.background=element_rect(fill='darkgrey'),
             panel.grid.minor = element_line(colour='white'),
             panel.grid.major = element_line(color='red',size = 2),
             legend.text = element_text(size = 8))
ggplot(data=diamonds,aes(x=clarity,fill=cut))+
  geom_bar()

theme_set(old)
ggplot(data=diamonds,aes(x=clarity,fill=cut))+
  geom_bar()

# ggthemes ####
plottemp+theme_economist()

plottemp+theme_excel()

#manipulate plots from packages
fit<- survfit(Surv(time, status) ~ sex, data = lung)
ggsurvplot(fit)

(ggsurv <- ggsurvplot(fit,pval=F,risk.table='absolute',
                      fun='pct',cumevents=F, censor.shape="|",
                      tables.height=.25,palette=pal_jama("default")(2),
                      ylim=c(05,100),xlim=c(0,1000),break.time.by=250,
                      axes.offset=T))

ggsurv$plot<-ggsurv$plot+
  annotate('text',x=700,y=92,size=rel(5),
           label=surv_pvalue(fit,method='n')$pval.txt)+
  geom_vline(xintercept=0)+
  theme(axis.line.y=element_blank(),
        panel.grid.major.y=element_line(color='grey'),
        axis.text.y=element_text(hjust=1,
                                 margin=margin(r=0)),
        axis.ticks.y=element_blank())
ggsurv

ggsurv$table <- ggsurv$table + 
  # scale_x_continuous(breaks=seq(0,180,30),expand=c(0,0))+
  theme(
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y=element_blank(),
    plot.title=element_text(size=rel(1.1)))
ggsurv

#Regression trees
gg_rtree<-function(rpartdata=rpart_out,miny=NULL,
                   title='')
{
  t<-ggdendro::dendro_data(rpartdata,uniform=T,compress=T)
  if(is.numeric(t$leaf_labels$label))
  {
    t$leaf_labels$label<-prettyNum(as.numeric(t$leaf_labels$label))
  }
  t$leaf_labels<-arrange(t$leaf_labels,x)
  if(is.null(miny)) {miny<-c(min(t$leaf_labels$y),min(t$leaf_labels$y)/2)}
  t$labels$label<-enc2utf8(as.character(t$labels$label))
  t$labels$label<-stringr::str_replace(pattern='>=',replacement=' \u2265 ',
                                       string=gsub('<',' <',t$labels$label))
  t$segments$leaf<-'no'
  for (i in 1:nrow(t$segments))
  {
    if(t$segments$y[i]>t$segments$yend[i])
    {
      temp<-t$segments$y[i]
      t$segments$y[i]<-t$segments$yend[i]
      t$segments$yend[i]<-temp
    }
    
  }
  for(i in 1:nrow(t$leaf_labels))
  {
    t$segments$y[which(t$segments$x==t$leaf_labels$x[i] &
                         t$segments$y==t$leaf_labels$y[i])]<-
      ifelse(i%%2==0,miny[1],miny[2])
    t$leaf_labels$y[i]<-
      ifelse(i%%2==0,miny[1],miny[2])
    t$segments$leaf[which(t$segments$x==t$leaf_labels$x[i] &
                            t$segments$y==t$leaf_labels$y[i])]<-'yes'
  }
  #yes/no edges
  t$yes<-t$segments[which(t$segments$y==t$segments$yend),1:3]
  t$yes$label<-'yes'
  t$no<-t$segments[which(t$segments$y==t$segments$yend),c(1,3:4)]
  colnames(t$no)[3]<-'x'
  t$no$label<-'no'
  
  # t$leaf_labels$y<-c(min(t$leaf_labels$y),0.05)
  tree<-ggplot(t$segments)+
    ggtitle(title)+
    geom_segment(aes(x,y,xend=xend,yend=yend,
                     linetype=leaf,size=leaf))+
    scale_size_manual(values=c(.5,1.25),guide=F)+
    scale_linetype_manual(values=c(1,2),guide=F)+
    geom_label(data=t$labels,aes(x,y,label=gsub('>=',' \u2265 ',label)),
               vjust=-.25,hjust=.5,fontface='bold')+
    geom_label(data=t$yes,aes(x,y,label=label),vjust=1,size=3,
               label.r=unit(.5,units='lines'),color='white',fill='black',
               fontface='bold')+
    geom_label(data=t$no,aes(x,y,label=label),vjust=1,size=3,
               label.r=unit(.5,units='lines'),color='white',fill='black',
               fontface='bold')+
    geom_label(data=t$leaf_labels,aes(x,y,label=label),vjust=1,
               label.r=unit(0,'lines'),fontface='bold',fill='lightgrey')+
    scale_y_continuous(limits=c(0,max(t$labels$y)*1.1))+
    ggdendro::theme_dendro()+
    theme(plot.title = element_text(hjust = 0.5,size=rel(2)))
  return(tree)
}

rpart_out <- rpart(mpg~factor(am)+wt+hp,data=mtcars,minsplit=5)
prp(rpart_out)

gg_rtree(rpart_out,title = 'mpg prediction')

https://rud.is/books/creating-ggplot2-extensions/demystifying-ggplot2.html